Creating a Hex Bin Map with Pell Grant Data
By Brendan Graham in tidy tuesday data viz hex bin map
September 10, 2022
library(tidytuesdayR)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(purrr)
library(skimr)
library(here)
## here() starts at /Users/brendangraham/projects/brendangraham.online
library(ggrepel)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.1 ✔ rsample 1.1.0
## ✔ dials 1.0.0 ✔ tune 1.0.0
## ✔ infer 1.0.3 ✔ workflows 1.0.0
## ✔ modeldata 1.0.0 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.1 ✔ yardstick 1.0.0
## ✔ recipes 1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(rules)
##
## Attaching package: 'rules'
##
## The following object is masked from 'package:dials':
##
## max_rules
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(ghibli)
library(gghighlight)
library(highcharter)
library(ggsci)
library(usdata)
source(here::here("content", "blog", "util.R"))
Get the data
First I read in the data and do some minor cleaning:
* only include data post-1999 since it looks like 1999 isn’t a full
year of data. * overwrite the state_name variable with the full state
name instead of the abbreviation. This makes creating a hex bin map
later on a little easier. One consequence of this is Puerto Rico and
other US Territories are excluded
tt_url <-
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-08-30/pell.csv"
pell <-
readr::read_csv(tt_url)
## Rows: 100474 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): STATE, NAME, SESSION
## dbl (3): AWARD, RECIPIENT, YEAR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(pell)
| Name | pell |
| Number of rows | 100474 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Table 1: Data summary
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| STATE | 0 | 1 | 2 | 2 | 0 | 59 | 0 |
| NAME | 0 | 1 | 3 | 70 | 0 | 10731 | 0 |
| SESSION | 0 | 1 | 7 | 7 | 0 | 19 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| AWARD | 4 | 1 | 3950027.16 | 12022838.13 | 0 | 268390.2 | 1060252 | 3603497 | 1208452160 | ▇▁▁▁▁ |
| RECIPIENT | 0 | 1 | 1249.81 | 3517.14 | 0 | 94.0 | 370 | 1246 | 304583 | ▇▁▁▁▁ |
| YEAR | 0 | 1 | 2008.26 | 5.35 | 1999 | 2004.0 | 2008 | 2013 | 2017 | ▇▇▆▇▇ |
pell <-
readr::read_csv(tt_url) %>%
janitor::clean_names() %>%
filter(year > 1999) %>%
mutate(state_name = usdata::abbr2state(abbr = state))
## Rows: 100474 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): STATE, NAME, SESSION
## dbl (3): AWARD, RECIPIENT, YEAR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pell %>%
get_table()
Explore the data
It looks like some time around 2009 the total dollar amount of Pell grants increased and remained at a higher level the prior to 2009. The volume of grants remained constant though, so we see the impact of this increase in the average amount per grant.
ggpubr::ggarrange(
pell %>%
group_by(year) %>%
tally() %>%
ggplot(., aes(x = year, y = n)) +
geom_col(fill = bg_red) +
bg_theme(base_size = 12, plot_title_size = 12) +
labs(x = "Year", y = "Count", title = "Total Pell Grant Volume per Year"),
pell %>%
group_by(year) %>%
summarise(total_amt = sum(award, na.rm = T)) %>%
ggplot(., aes(x = year, y = total_amt)) +
geom_col(fill = bg_green) +
bg_theme(base_size = 12, plot_title_size = 12) +
scale_y_continuous(labels = dollar) +
labs(x = "Year", y = "", title = "Total Pell Grant Dollar Amount per Year"),
pell %>%
group_by(year) %>%
summarise(mean_amt = mean(award, na.rm = T)) %>%
ggplot(., aes(x = year, y = mean_amt)) +
geom_col(fill = bg_blue) +
bg_theme(base_size = 12, plot_title_size = 12) +
scale_y_continuous(labels = dollar) +
labs(x = "Year", y = "", title = "Mean Pell Grant Dollar Amount per Year"),
nrow = 3,
ncol = 1
)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'RobotoMono-Regular' not found in PostScript font database

Looking state by states we can see not all states behave the same. Some states spike post 2009 and come down to a new baseline level like AZ, while some increase and remain steady at a new level, like CA.
pell %>%
group_by(state, year) %>%
summarise(mean_amt = mean(award, na.rm = T)) %>%
ungroup() %>%
ggplot(., aes(x = year, y = mean_amt, color = state)) +
geom_line() +
geom_point() +
scale_y_continuous(labels = dollar) +
gghighlight(label_key = state, use_direct_label = T, state %in% c("CA", "AZ"), use_group_by = FALSE) +
labs(x = "Year", y = "Amount ($)", title = "Mean Pell Grant Dollar Amount per Year") +
scale_color_npg() +
bg_theme()
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.

Digging into this post 2009 increase some more, there is some evidence that the policy changes as a result of the 2008 financial crisis can explain some of what we’re seeing in the data.
Specifically, that document notes that
the recession of 2007–2009 and the subsequent slow recovery drew more students into the recipient pool. Eligibility increased as adult students and the families of dependent students experienced losses in income and assets; enrollment of eligible students also rose as people who had lost jobs sought to acquire new skills and people who would have entered the workforce enrolled in school because they could not find employment. The expansion of online education, particularly at for-profit institutions, attracted still more students, many of whom were eligible for Pell grant.
Visualizing the Change
This plot ranks states in order of change post-2009, with Arizona leading the way.
# https://www.cbo.gov/sites/default/files/cbofiles/attachments/44448_PellGrants_9-5-13.pdf
time_prep <-
pell %>%
mutate(timeframe = ifelse(year < 2009, "pre_2009", "post_2009")) %>%
group_by(year, state_name, timeframe) %>%
summarise(total = n()) %>%
ungroup() %>%
group_by(state_name, timeframe) %>%
mutate(mean_grants = mean(total)) %>%
select(state_name, timeframe, mean_grants)
## `summarise()` has grouped output by 'year', 'state_name'. You can override
## using the `.groups` argument.
pell_time_state_comparison <-
pell %>%
mutate(timeframe = ifelse(year < 2009, "pre_2009", "post_2009")) %>%
group_by(state_name, timeframe) %>%
summarise(total_amt = sum(award, na.rm = T),
mean_amt = mean(award, na.rm = T),
sd_amt = sd(award, na.rm = T)) %>%
left_join(., time_prep, by = c("state_name", "timeframe")) %>%
distinct()
## `summarise()` has grouped output by 'state_name'. You can override using the
## `.groups` argument.
pell_time_state_comparison %>%
select(state_name, mean_amt, timeframe) %>%
pivot_wider(names_from = timeframe, values_from = mean_amt) %>%
mutate(diff = post_2009 - pre_2009) %>%
filter(!(is.na(state_name))) %>%
ggplot(., aes(x = reorder(state_name, diff), y = diff, label = paste0("$", round(diff/100000,2)), fill = log(diff))) +
scale_y_continuous(labels = scales::dollar) +
scale_fill_gradient(low = "cornflowerblue", high = bg_green) +
geom_col(show.legend = FALSE) +
coord_flip() +
bg_theme(base_size = 16) +
geom_text(hjust = 1.05, family = 'RobotoMono-Regular', size = 3, color = 'white') +
labs(y = "Amount($)", x = "State", title =" States Ranked by Post-2009 Change in Mean Grant Amount")

We can also visualize these increases using a hex bin map, in which each state is represented by a hexagon , color coded with the post-2009 increase in avg Pell grant money received.
# https://r-graph-gallery.com/328-hexbin-map-of-the-usa.html
# Download the Hexagones boundaries at geojson format here:
# https://team.carto.com/u/andrew/tables/andrew.us_states_hexgrid/public/map.
library(tidyverse)
library(geojsonio)
## Registered S3 method overwritten by 'geojsonsf':
## method from
## print.geojson geojson
##
## Attaching package: 'geojsonio'
## The following object is masked from 'package:base':
##
## pretty
library(RColorBrewer)
library(rgdal)
## Loading required package: sp
## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
##
## rgdal: version: 1.5-32, (SVN revision 1176)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.4.2, released 2022/03/08
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/sf/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.5-0
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
# Download the Hexagones boundaries at geojson format here: https://team.carto.com/u/andrew/tables/andrew.us_states_hexgrid/public/map.
# Load this file. (Note: I stored in a folder called DATA)
spdf <-
geojson_read(here::here("content", "blog", "pell", "us_states_hexgrid.geojson"),
what = "sp")
# Bit of reformating
spdf@data <-
spdf@data %>%
mutate(google_name = gsub(" \\(United States\\)", "", google_name))
# Show framework
# plot(spdf)
library(broom)
spdf_fortified <-
tidy(spdf, region = "google_name")
library(rgeos)
## rgeos version: 0.5-9, (SVN revision 684)
## GEOS runtime version: 3.10.2-CAPI-1.16.0
## Please note that rgeos will be retired by the end of 2023,
## plan transition to sf functions using GEOS at your earliest convenience.
## GEOS using OverlayNG
## Linking to sp version: 1.4-7
## Polygon checking: TRUE
##
## Attaching package: 'rgeos'
## The following object is masked from 'package:parsnip':
##
## translate
centers <-
cbind.data.frame(data.frame(gCentroid(spdf, byid=TRUE), id=spdf@data$iso3166_2))
ggplot() +
geom_polygon(data = spdf_fortified, aes( x = long, y = lat, group = group), fill="skyblue", color="white") +
geom_text(data=centers, aes(x=x, y=y, label=id)) +
theme_void() +
coord_map()

# Merge geospatial and numerical information
spdf_fortified <-
spdf_fortified %>%
left_join(. , pell_time_state_comparison %>%
select(state_name, mean_amt, timeframe) %>%
pivot_wider(names_from = timeframe, values_from = mean_amt) %>%
mutate(diff = post_2009 - pre_2009), by = c("id" = "state_name"))
# Make a first chloropleth map
ggplot() +
geom_polygon(data = spdf_fortified, aes(fill = diff, x = long, y = lat, group = group)) +
scale_fill_gradient(trans = "log") +
theme_void() +
coord_map()

hist(spdf_fortified$diff)

# Prepare binning
spdf_fortified$bin <-
cut(spdf_fortified$diff,
breaks =c(seq(0, 12000000, 2000000), Inf),
labels = c("0-$2 mil", "$2-$4 mil", "$4-$6 mil", "$6-$8 mil", "$8-$10 mil", "$10-$12 mil", "$12+ mil"),
include.lowest = TRUE)
# Prepare a color scale coming from the viridis color palette
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
my_palette <-
viridis::inferno(6)
ggplot() +
geom_polygon(data = spdf_fortified, aes(fill = bin, x = long, y = lat, group = group)) +
geom_text(data=centers, aes(x=x, y=y, label=id), color="white", size=3, alpha=0.6) +
theme_void() +
coord_map() +
scale_fill_manual(
values = my_palette,
name = "Change in Pell Grants $millions",
guide = guide_legend(keyheight = unit(3, units = "mm"),
keywidth=unit(12, units = "mm"),
label.position = "bottom",
title.position = 'top', nrow=1)
) +
ggtitle("Change in Mean Pell Grants by State\nPre/Post 2009") +
theme(
legend.position = c(0.5, 0.9),
text = element_text(color = "#22211d"),
plot.background = element_rect(fill = "#f5f5f2", color = NA),
panel.background = element_rect(fill = "#f5f5f2", color = NA),
legend.background = element_rect(fill = "#f5f5f2", color = NA),
plot.title = element_text(size= 22, hjust=0.5, color = "#4e4d47", margin = margin(b = -0.1, t = 0.4, l = 2, unit = "cm")),
)

- Posted on:
- September 10, 2022
- Length:
- 1350 minute read, 287499 words
- Categories:
- tidy tuesday data viz hex bin map
- Tags:
- tidy tuesday data viz hex bin map